%load_ext pretty_jupyter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.stats import chi2_contingency
from collections import Counter
from utils import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import shap
The pretty_jupyter extension is already loaded. To reload it, use:
  %reload_ext pretty_jupyter
filename = "./O_G_Equipment_Data.xlsx"
data = pd.read_excel(filename)

Exploratory analysis

data.head()
Cycle Preset_1 Preset_2 Temperature Pressure VibrationX VibrationY VibrationZ Frequency Fail
0 1 3 6 44.235186 47.657254 46.441769 64.820327 66.454520 44.483250 False
1 2 2 4 60.807234 63.172076 62.005951 80.714431 81.246405 60.228715 False
2 3 2 1 79.027536 83.032190 82.642110 98.254386 98.785196 80.993479 False
3 4 2 3 79.716242 100.508634 122.362321 121.363429 118.652538 80.315567 False
4 5 2 5 39.989054 51.764833 42.514302 61.037910 50.716469 64.245166 False
print(data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Cycle        800 non-null    int64  
 1   Preset_1     800 non-null    int64  
 2   Preset_2     800 non-null    int64  
 3   Temperature  800 non-null    float64
 4   Pressure     800 non-null    float64
 5   VibrationX   800 non-null    float64
 6   VibrationY   800 non-null    float64
 7   VibrationZ   800 non-null    float64
 8   Frequency    800 non-null    float64
 9   Fail         800 non-null    bool   
dtypes: bool(1), float64(6), int64(3)
memory usage: 57.2 KB
None

How many times the equipment has failed?

print(f"The equipment has failed {data['Fail'].value_counts().iloc[1]} times")
print(f"The failure rate is {data['Fail'].value_counts().iloc[1]/len(data)*100:.2f}%")
fails_conts = data['Fail'].value_counts()

labels = ['False', 'True']
counts = [fails_conts[False], fails_conts[True]]

fig, ax = plt.subplots()
ax.bar(labels, counts, color=['green', 'red'])

ax.set_ylabel('Count')
ax.set_title('Comparison of Fail and Not Fail Equipment')

for i in range(len(counts)):
    ax.text(i, counts[i] + 1, str(counts[i]), ha='center')
ax.grid()

plt.savefig('fail_not_fail.png', dpi=300 , bbox_inches='tight')
The equipment has failed 66 times
The failure rate is 8.25%

Categorize failures by Presets

fails_per_preset_1 = data.groupby(['Preset_1', 'Fail']).size().unstack().sort_values(by=True, ascending=False)
fails_per_preset_2 = data.groupby(['Preset_2', 'Fail']).size().unstack().sort_values(by=True, ascending=False)

display_side_by_side([fails_per_preset_1, fails_per_preset_2], ['Fails per Preset 1', 'Fails per Preset 2'])
Fails per Preset 1
Fail False True
Preset_1    
1 237 27
2 260 21
3 237 18
   
Fails per Preset 2
Fail False True
Preset_2    
5 88 12
1 84 11
2 92 9
6 92 9
7 100 9
8 93 7
3 95 6
4 90 3
   
# fails_per_preset_2_filled = fails_per_preset_2.fillna(0)

# chi2, p, dof, expected = chi2_contingency(fails_per_preset_2_filled)

How many combinations of Presets?

Here we want to evaluate how the combination of presets affecsts the equipment failures.

combination = data.groupby(['Preset_1','Preset_2', 'Fail']).size().unstack(fill_value=0)
combination['Percentage_fail'] = combination[True] / (combination[True] + combination[False]) * 100
combination['cumulative'] = combination[True].cumsum()
combination.reset_index(inplace=True)
combination.reset_index(inplace=True)
combination_sorted =combination.sort_values(by=True, ascending=False)
combination_sorted
Fail index Preset_1 Preset_2 False True Percentage_fail cumulative
1 1 1 2 33 5 13.157895 9
4 4 1 5 26 5 16.129032 18
0 0 1 1 30 4 11.764706 4
20 20 3 5 25 4 13.793103 59
6 6 1 7 34 4 10.526316 25
8 8 2 1 26 4 13.333333 31
15 15 2 8 33 4 10.810811 48
22 22 3 7 31 3 8.823529 65
21 21 3 6 27 3 10.000000 62
16 16 3 1 28 3 9.677419 51
13 13 2 6 34 3 8.108108 42
12 12 2 5 37 3 7.500000 39
5 5 1 6 31 3 8.823529 21
10 10 2 3 24 2 7.692308 35
14 14 2 7 35 2 5.405405 44
9 9 2 2 32 2 5.882353 33
7 7 1 8 22 2 8.333333 27
17 17 3 2 27 2 6.896552 53
18 18 3 3 30 2 6.250000 55
3 3 1 4 20 2 9.090909 13
2 2 1 3 41 2 4.651163 11
11 11 2 4 39 1 2.500000 36
23 23 3 8 38 1 2.564103 66
19 19 3 4 31 0 0.000000 55

Let's see now if the failues counts, has same relation with the cycles. The data was combined acoording to the pressets, and If there was or not a failure. The table in this first moment has the same order as the cyles running. The point here is to see if there is any point in time were we can see a tendency of growth or reduction in the cummulative events.

plots = sns.lmplot(x='index', y='cumulative', data=combination, fit_reg=True, legend=True, scatter_kws={'color': 'black'}, line_kws={'color': 'red'})
plots.set(title='Cumulative Failures')
plots.set(xlabel='Presset Combination', ylabel='Cumulative Failures')
plt.grid()
plt.savefig('cumulative_failures.png', dpi=300, bbox_inches='tight')

By looking the picture we just can see that the Failues happens during all the period, and the cumulative sum of failues increases over time. The cummulative curve is pratically linear. Although the curve is well lienarly fitted, not showing clear relation bewteen the presets and the failures, we can observe that around the combination 20, 11, 4,and 23, there is a little drop relatively to the curve. This could indicate that those presets could be countrinuing to a reduction in the cummulative sum. In other words, those combinations can have lower counts of failures.

 

We can see by looking the table that the combination of presets 3 and 4, had no failures. Also, the percentages of failues for 3-8 and 2-4 were the lowest.

Distribution of values

One way to evaluate the contribution of each variable in equipment failure is the distribution of its intensity. The Figure below show the distribution for each of the variables.

parameters = ['Temperature', 'Pressure','VibrationX', 'VibrationY', 'VibrationZ', 'Frequency']
fig, ax = plt.subplots(3, 2, figsize=(15, 15))
axs = ax.T.flatten()
for axx in range(len(axs)):
    g = sns.histplot(data, x=parameters[axx], kde=True, element='step', legend=True, ax=axs[axx], common_norm=False)
    leg = g.axes.get_legend()
    axs[axx].set_xlabel(parameters[axx])
    axs[axx].set_ylabel('Count')
    new_title = 'Fail'
plt.savefig('histograms.png', dpi=300, bbox_inches='tight')

The Figure show the distribution of variables measurementes. All the sensors, expect the VibrationX, present a shape more like a bimodal distribution. It can be realated to a subgroup of values. Splitting the data in Failure and not Failure can give us a better understanding.

fig, ax = plt.subplots(3, 2, figsize=(15, 15))

axs = ax.T.flatten()

for axx in range(len(axs)):
    g = sns.histplot(data, x=parameters[axx], hue='Fail', kde=True, element='step', legend=True,stat='density', ax=axs[axx], common_norm=False)
    leg = g.axes.get_legend()
    axs[axx].set_xlabel(parameters[axx])
    axs[axx].set_ylabel('Density')
    new_title = 'Fail'
    leg.set_title(new_title)
    new_labels = ['False', 'True']
    for t, l in zip(leg.texts, new_labels):
        t.set_text(l)

plt.savefig('histograms_separated.png', dpi=300, bbox_inches='tight')

The distributions in the Figure above shows a clear separation in the Failures and not Failure measurements. We can see that the measurements have higher values when the failures happens. The VibrationZ, VibrationY, and Pressure were the ones that also have a tail with larger values.

Resampling the data

newDF = oversamplig_dataframe(data, 'Fail', method='ADASYN')

Modeling prediciton

X = newDF.drop(columns=['Cycle','Preset_1', 'Preset_2', 'Fail'])
y = newDF.drop(columns=['Cycle', 'Preset_1', 'Preset_2', 'Temperature', 'Pressure', 'VibrationX', 'VibrationY', 'VibrationZ', 'Frequency'])
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.20, random_state=0)
ss_train = StandardScaler()
X_train = ss_train.fit_transform(X_train)

ss_test = StandardScaler()
X_test = ss_test.fit_transform(X_test)
model = LogisticRegression()

model.fit(X_train, y_train)
predictions = model.predict(X_test)
cm = confusion_matrix(y_test, predictions)

TN, FP, FN, TP = confusion_matrix(y_test, predictions).ravel()
print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
True Positive(TP)  =  137
False Positive(FP) =  12
True Negative(TN)  =  147
False Negative(FN) =  0
accuracy =  (TP + TN) / (TP + FP + TN + FN)

print('Accuracy of the binary classifier = {:0.3f}'.format(accuracy))
Accuracy of the binary classifier = 0.959

Variable importance (Explainability)

X_test_df = pd.DataFrame(X_test, columns=X.columns)
explainer = shap.Explainer(model.predict, X_test)
shap_values = explainer(X_test_df)
shap.summary_plot(shap_values, plot_type='violin', show=False)
plt.savefig('shap_summary_plot.png', dpi=300, bbox_inches='tight')
models = {}

# Logistic Regression
models['Logistic Regression'] = LogisticRegression()
# Support Vector Machines
models['Support Vector Machines'] = LinearSVC()
# Decision Trees
models['Decision Trees'] = DecisionTreeClassifier()
# Random Forest
models['Random Forest'] = RandomForestClassifier()
# Naive Bayes
models['Naive Bayes'] = GaussianNB()
# K-Nearest Neighbors
models['K-Nearest Neighbor'] = KNeighborsClassifier()
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy, precision, recall = {}, {}, {}

for key in models.keys():
    
    # Fit the classifier
    models[key].fit(X_train, y_train)
    
    # Make predictions
    predictions = models[key].predict(X_test)
    
    # Calculate metrics
    accuracy[key] = accuracy_score(predictions, y_test)
    precision[key] = precision_score(predictions, y_test)
    recall[key] = recall_score(predictions, y_test)
import pandas as pd

df_model = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall'])
df_model['Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()
df_model
Accuracy Precision Recall
Logistic Regression 0.959459 1.000000 0.919463
Support Vector Machines 0.959459 1.000000 0.919463
Decision Trees 0.925676 0.956204 0.891156
Random Forest 0.976351 1.000000 0.951389
Naive Bayes 0.922297 0.992701 0.860759
K-Nearest Neighbor 0.956081 1.000000 0.913333
ax = df_model.plot.barh()
ax.legend(
    ncol=len(models.keys()), 
    bbox_to_anchor=(0, 1), 
    loc='lower left', 
    prop={'size': 14}
)
plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
 
 
 
falhas = data[data['Fail'] == True]['Cycle']
fig, axs = plt.subplots(6, 1, figsize=(18, 20), sharex=True, dpi=200)
axs[0].plot(data['Cycle'], data['VibrationX'], label='VibrationX', color='blue')
axs[1].plot(data['Cycle'], data['VibrationY'], label='VibrationY', color='red')
axs[2].plot(data['Cycle'], data['VibrationZ'], label='VibrationZ', color='green')
axs[3].plot(data['Cycle'], data['Temperature'], label='VibrationX', color='black')
axs[4].plot(data['Cycle'], data['Pressure'], label='VibrationX', color='gray')
axs[5].plot(data['Cycle'], data['Frequency'], label='Frequency', color='gray')


for ax in axs:
    for ciclo in falhas:
        ax.axvline(x=ciclo, color='k', linestyle='--', linewidth=1)

axs[0].set_ylabel('VibrationX')
axs[1].set_ylabel('VibrationY')
axs[2].set_ylabel('VibrationZ')
axs[3].set_ylabel('Temperature')
axs[4].set_ylabel('Pressure')
axs[4].set_xlabel('Cycle')
axs[5].set_xlabel('Frequency')

axs[0].set_title('VibrationX')
axs[1].set_title('VibrationY')
axs[2].set_title('VibrationZ')
axs[3].set_title('Temperature')
axs[4].set_title('Pressure')
axs[5].set_title('Frequency')


plt.savefig('Vibration_Analysis.png', dpi=200, bbox_inches='tight')
import numpy as np
import pywt
import matplotlib.pyplot as plt

# Selecionar as componentes para análise
vibration_x = data['VibrationX'].values
vibration_y = data['VibrationY'].values
vibration_z = data['VibrationZ'].values
pressure = data['Pressure'].values
temperature = data['Temperature'].values

scales = np.arange(1, 128)

coef_x, freqs_x = pywt.cwt(vibration_x, scales, 'cmor1.5-0.5')
coef_y, freqs_y = pywt.cwt(vibration_y, scales, 'cmor1.5-0.5')
coef_z, freqs_z = pywt.cwt(vibration_z, scales, 'cmor1.5-0.5')
coef_pres, freqs_pres = pywt.cwt(pressure, scales, 'cmor1.5-0.5')
coef_temp, freqs_temp = pywt.cwt(temperature, scales, 'cmor1.5-0.5')

fig, axs = plt.subplots(5, 1, figsize=(16, 14), sharex=True)
cax1 = axs[0].pcolormesh(np.arange(len(vibration_x)), freqs_x, np.abs(coef_x), cmap='plasma')
fig.colorbar(cax1, ax=axs[0])
axs[0].set_title('VibrationX')
axs[0].set_ylabel('Frequency')
axs[0].set_yscale('log', base=2)

cax2 = axs[1].pcolormesh(np.arange(len(vibration_y)), freqs_y, np.abs(coef_y), cmap='plasma')
fig.colorbar(cax2, ax=axs[1])
axs[1].set_title('VibrationY')
axs[1].set_ylabel('Frequency')
axs[1].set_yscale('log', base=2)

cax3 = axs[2].pcolormesh(np.arange(len(vibration_z)), freqs_z, np.abs(coef_z), cmap='plasma')
fig.colorbar(cax3, ax=axs[2])
axs[2].set_title('VibrationZ')
axs[2].set_ylabel('Frequency')
axs[2].set_yscale('log', base=2)

cax4 = axs[3].pcolormesh(np.arange(len(pressure)), freqs_pres, np.abs(coef_pres), cmap='plasma')
fig.colorbar(cax4, ax=axs[3])
axs[3].set_title('Pressure')
axs[3].set_ylabel('Frequency')
axs[3].set_yscale('log', base=2)

cax5 = axs[4].pcolormesh(np.arange(len(temperature)), freqs_temp, np.abs(coef_temp), cmap='plasma')
fig.colorbar(cax5, ax=axs[4])
axs[4].set_title('Temperature')
axs[4].set_ylabel('Frequency')
axs[4].set_yscale('log', base=2)

axs[4].set_xlabel('Cycle')

plt.tight_layout()
plt.savefig('wavelet_Analysis.png', dpi=200, bbox_inches='tight')
pywt.scale2frequency('cmor1.5-0.5',scales)
array([0.5       , 0.25      , 0.16666667, 0.125     , 0.1       ,
       0.08333333, 0.07142857, 0.0625    , 0.05555556, 0.05      ,
       0.04545455, 0.04166667, 0.03846154, 0.03571429, 0.03333333,
       0.03125   , 0.02941176, 0.02777778, 0.02631579, 0.025     ,
       0.02380952, 0.02272727, 0.02173913, 0.02083333, 0.02      ,
       0.01923077, 0.01851852, 0.01785714, 0.01724138, 0.01666667,
       0.01612903, 0.015625  , 0.01515152, 0.01470588, 0.01428571,
       0.01388889, 0.01351351, 0.01315789, 0.01282051, 0.0125    ,
       0.01219512, 0.01190476, 0.01162791, 0.01136364, 0.01111111,
       0.01086957, 0.0106383 , 0.01041667, 0.01020408, 0.01      ,
       0.00980392, 0.00961538, 0.00943396, 0.00925926, 0.00909091,
       0.00892857, 0.00877193, 0.00862069, 0.00847458, 0.00833333,
       0.00819672, 0.00806452, 0.00793651, 0.0078125 , 0.00769231,
       0.00757576, 0.00746269, 0.00735294, 0.00724638, 0.00714286,
       0.00704225, 0.00694444, 0.00684932, 0.00675676, 0.00666667,
       0.00657895, 0.00649351, 0.00641026, 0.00632911, 0.00625   ,
       0.00617284, 0.00609756, 0.0060241 , 0.00595238, 0.00588235,
       0.00581395, 0.00574713, 0.00568182, 0.00561798, 0.00555556,
       0.00549451, 0.00543478, 0.00537634, 0.00531915, 0.00526316,
       0.00520833, 0.00515464, 0.00510204, 0.00505051, 0.005     ,
       0.0049505 , 0.00490196, 0.00485437, 0.00480769, 0.0047619 ,
       0.00471698, 0.0046729 , 0.00462963, 0.00458716, 0.00454545,
       0.0045045 , 0.00446429, 0.00442478, 0.00438596, 0.00434783,
       0.00431034, 0.0042735 , 0.00423729, 0.00420168, 0.00416667,
       0.00413223, 0.00409836, 0.00406504, 0.00403226, 0.004     ,
       0.00396825, 0.00393701])